function [last_episode,all_episodes]=simulation_exp(alpha,beta,T,vh,vl,mu_l,sigma_l,grid_middle,tick,grid_size)

%SIMULATION_EXP Runs a Q-learning experiment once, for T episodes of 2
%periods (tau = 1, 2), in the 2-AMM case.
%
%Inputs:
%  alpha       learning rate used in the Q-value updates
%  beta        decay rate of the experimentation probability, exp(-beta*t)
%  T           number of episodes (positive integer)
%  vh, vl      high and low asset values, each drawn with probability 1/2
%  mu_l        mean of the normally distributed investor shock l
%  sigma_l     standard deviation of the investor shock l
%  grid_middle central price of the price grid
%  tick        distance between two consecutive prices on the grid
%  grid_size   number of ticks on each side of grid_middle; the grid has
%              2*grid_size+1 prices, from grid_middle - grid_size*tick to
%              grid_middle + grid_size*tick
%
%Outputs:
%
%last_episode, a 1x22 vector with the following variables in columns:
%1: Price actually played by AMM 1 in episode T, tau = 1
%2: Price actually played by AMM 1 in episode T, tau = 2
%3: Price actually played by AMM 2 in episode T, tau = 1
%4: Price actually played by AMM 2 in episode T, tau = 2
%5: Greedy price of AMM1 in episode T, tau = 1
%6: Greedy price of AMM1 in episode T, tau = 2, s_1 = NT
%7: Greedy price of AMM1 in episode T, tau = 2, s_1 = 0
%8: Greedy price of AMM1 in episode T, tau = 2, s_1 = 1/2
%9: Greedy price of AMM1 in episode T, tau = 2, s_1 = 1
%10: Greedy price of AMM2 in episode T, tau = 1
%11: Greedy price of AMM2 in episode T, tau = 2, s_2 = NT
%12: Greedy price of AMM2 in episode T, tau = 2, s_2 = 0
%13: Greedy price of AMM2 in episode T, tau = 2, s_2 = 1/2
%14: Greedy price of AMM2 in episode T, tau = 2, s_2 = 1
%15: Equal to 1 if s_1 = NT, 0 otherwise
%16: Equal to 1 if s_1 = 0, 0 otherwise
%17: Equal to 1 if s_1 = 1/2, 0 otherwise
%18: Equal to 1 if s_1 = 1, 0 otherwise
%19: v_tilde
%20: l_tilde_1
%21: l_tilde_2
%22: C_t, the length of the final run of consecutive episodes (episode T
%    included) in which both AMMs have the same greedy prices in all states
%    as in episode T. C_t is therefore between 1 and T.
%
%all_episodes, a Tx40 matrix whose row t contains the values in episode t
%(t = 1,...,T) of the following variables, in columns:
%
%1: Price actually played by AMM 1 in episode t, tau = 1
%2: Price actually played by AMM 1 in episode t, tau = 2
%3: Price actually played by AMM 2 in episode t, tau = 1
%4: Price actually played by AMM 2 in episode t, tau = 2
%5: Greedy price of AMM1 in episode t, tau = 1
%6: Greedy price of AMM1 in episode t, tau = 2, s_1 = NT
%7: Greedy price of AMM1 in episode t, tau = 2, s_1 = 0
%8: Greedy price of AMM1 in episode t, tau = 2, s_1 = 1/2
%9: Greedy price of AMM1 in episode t, tau = 2, s_1 = 1
%10: Greedy price of AMM2 in episode t, tau = 1
%11: Greedy price of AMM2 in episode t, tau = 2, s_2 = NT
%12: Greedy price of AMM2 in episode t, tau = 2, s_2 = 0
%13: Greedy price of AMM2 in episode t, tau = 2, s_2 = 1/2
%14: Greedy price of AMM2 in episode t, tau = 2, s_2 = 1
%15: Equal to 1 if s_1 = NT, 0 otherwise
%16: Equal to 1 if s_1 = 0, 0 otherwise
%17: Equal to 1 if s_1 = 1/2, 0 otherwise
%18: Equal to 1 if s_1 = 1, 0 otherwise
%19: a_min1 (ask price in tau = 1)
%20: a_min2 (ask price in tau = 2)
%
%Columns 21 to 40 contain the squared values of the variables in columns 1 to 20, in the same order.

%Pre-allocate
all_episodes = zeros(T,40);
all_episodes(:,15) = ones(T,1); %By default the state in tau = 2 is recorded as "no trade".
%When there is a trade this value is replaced by zero.

%Price grid: price_grid(k) is the price associated with price index k,
%k = 1,...,2*grid_size+1. It is computed once here and used as a lookup
%table, so every recorded price comes from the same table entry and the
%equality tests on prices below (ties, unchanged greedy prices) are exact.
n_prices   = 2*grid_size+1;
price_grid = (grid_middle - grid_size*tick - tick) + tick*(1:n_prices);

%Initialize the Q-matrix with random values, uniform on [3,6]. These are
%meant to be higher than the monopoly expected profit (NOTE: this holds for
%the baseline parameters according to the original authors; verify it when
%using other parameters). The Q-matrix has 2*grid_size+1 rows (one for each
%price) and 10 columns (5 for each AMM).

Q_n = 3+(6-3)*rand(n_prices,10); %Q-matrix for all players.

% The columns of the Q-matrix correspond to:
%1:  AMM 1, tau=1
%2:  AMM 1, tau=2 after no trade (s_1 = NT)
%3:  AMM 1, tau=2 after trade but 0 inventory (s_1 = 0)
%4:  AMM 1, tau=2 after trade and 1/2 inventory (s_1 = 1/2)
%5:  AMM 1, tau=2 after trade and 1 inventory (s_1 = 1)
%6:  AMM 2, tau=1
%7:  AMM 2, tau=2 after no trade (s_2 = NT)
%8:  AMM 2, tau=2 after trade but 0 inventory (s_2 = 0)
%9:  AMM 2, tau=2 after trade and 1/2 inventory (s_2 = 1/2)
%10: AMM 2, tau=2 after trade and 1 inventory (s_2 = 1)

%Generate a T x 1 vector of asset values, equal to vl or vh with probability 0.5 each.
v_tilde = randsample([vl, vh], T, true)';             %value of the asset in each episode
l  = normrnd(mu_l,sigma_l,T,2);                       %T x 2 matrix of random draws of l in each episode, 2 periods
vc = v_tilde +l;                                      %T x 2 matrix of investor valuations in each episode

%Experimentation probability in each episode, decaying at rate beta.
epsilon = exp(-beta*(1:1:T));               %1 x T vector of experimentation probabilities

%Generate a T x 4 matrix with 1 indicating experimentation at episode t.
%The columns correspond to:
%1: AMM 1, tau=1
%2: AMM 1, tau=2
%3: AMM 2, tau=1
%4: AMM 2, tau=2
%All draws are made in a single vectorized call on the client: a parfor
%over 4 cheap iterations only adds parallel-pool overhead and, when a pool
%is open, draws from the workers' random streams, which makes runs
%impossible to reproduce from the client seed.

Experiment = binornd(1,repmat(epsilon(:),1,4));

%Loop over all episodes.
for t = 1:T

    %%%%%%%%%%%%%%%%%%%%%%%%%%%% Period tau = 1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    s = zeros(1,2);  %Index of the price actually chosen by each player.
    I1 = zeros(1,2); %Inventory in tau = 1 of each player.
    R1 = zeros(1,2); %Revenue in tau = 1 of each player.
    S = 2*ones(1,2); %State in tau = 2 of each player, by default 2 (2 means no trade, 3 trade but 0 inventory, 4 trade and 1/2 inventory, 5 trade and 1 inventory)

    %We compute the greedy price in episode t, tau = 1, for each AMM.
    for i=1:2 %loop over AMMs
        q_col = 5*(i-1)+1; %Column of the Q-matrix for AMM i in tau = 1
        maxvector = find(Q_n(:,q_col) == max(Q_n(:,q_col))); %Indices of all the values corresponding to a maximum.
        s(1,i)  = maxvector(randi([1 length(maxvector)],1,1)); %Randomize if there are multiple greedy prices
        all_episodes(t,4+q_col) = price_grid(s(1,i)); %Convert the price index
        %into the actual price and record the greedy price of AMM i at
        %episode t and period tau = 1 (column 5 for AMM 1, 10 for AMM 2).

        %Compute the actual price chosen by AMM i based on experimentation or exploitation.
        if Experiment(t,2*(i-1)+1) == 1 %Column 2*(i-1)+1 corresponds to period tau = 1 for AMM i
            %AMM i explores: draw a random index in the range 1 to 2*grid_size+1
            s(1,i)  = randi([1 n_prices],1,1);
            all_episodes(t,2*(i-1)+1) = price_grid(s(1,i)); %Records the corresponding price as the price quoted by AMM i in tau = 1
        else
            %AMM i exploits: the greedy price is the price actually quoted in tau = 1
            all_episodes(t,2*(i-1)+1) = all_episodes(t,4+q_col);
        end
    end %end of the loop over AMMs

    %We compute the state and the revenue in episode t, tau = 1.
    %The investor buys if vc is at least the best price a_min, such that revenue is a_min if vc >= a_min, and zero otherwise.
    %If players set the same price the revenue is split between the players.
    %Note that to compute profit we need to subtract v_tilde, which is done in tau = 2.

    all_episodes(t,19) = min(all_episodes(t,1),all_episodes(t,3)); %compute the lowest price.

    %Compute inventories of both AMMs. If there is no trade, I1
    %(inventories) and S (states) just remain equal to their pre-allocated
    %values (0 for inventories, 2 for states).

    if all_episodes(t,19) <= vc(t,1) %Condition on the customer buying, hence there is a trade in tau = 1
        index = find(all_episodes(t,19) == [all_episodes(t,1) all_episodes(t,3)]); %find all AMMs that set a_min
        m = size(index,2);  %compute the number of AMMs that set a_min

        for i=index %loop over AMMs who set the best price
            I1(1,i) = 1 / m; %compute inventories, either 1/2 or 1 depending on how many AMMs set the best price
            R1(1,i) = I1(1,i)*all_episodes(t,19); %compute revenues in tau = 1, inventory times price
        end

        for i=1:2 %loop over both AMMs
            S(1,i) = S(1,i) + 1 + 2*I1(1,i); %compute the state for each player.
            %S is initialized at 2 and this line only runs when there is a
            %trade in tau = 1. It returns S = 3 for no inventory, 4 for
            %1/2 inventory, and 5 for 1 inventory.
            %Note: if extending the code to N AMMs this needs to be done differently.
        end

        %Record the state from the perspective of player 1:
        all_episodes(t,15) = 0; %Remove the pre-allocated value of 1 in column 15, which corresponds to state s_1 = NT
        all_episodes(t,13+S(1,1)) = 1; %Record 1 for the state that actually occurred. S(1,1) is between 3 and 5 here.
        %For S = 5 for instance we record 1 in column 18 (trade and s_1 = 1)

    end %end of the case in which the customer buys and there is a trade in tau = 1

    %Update the tau = 1 column of the Q-matrix for each player.
    for i=1:2
        value = max(Q_n(:,5*(i-1)+S(1,i))); %Maximum Q-value of AMM i in tau = 2, in the state that realized in tau = 1.
        %For instance if i = 1 and S = 3 (trade but 0 inventory for AMM 1),
        %pick the maximum value of column 3 of the Q-matrix.

        %Update the Q-value associated with the price actually played by
        %player i. Take into account the period 1 revenue + the value of
        %starting the next period in state S(1,i).
        Q_n(s(1,i),5*(i-1)+1) = (1-alpha)*Q_n(s(1,i),5*(i-1)+1) + alpha*(R1(1,i) + value);
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%% Period tau = 2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    s = zeros(1,2); %Index of the price chosen by each player in tau = 2.
    I2 = zeros(1,2); %Inventory acquired in tau = 2 by each player.
    maxindex=zeros(4,2); %Index of the greedy price in every period-2 state (rows) for each player (columns).

    %We compute and record the greedy price in every period-2 state.
    for i=1:2 %loop over both AMMs
        for j=2:5 %loop over all period-2 states of the Q-matrix
            q_col = 5*(i-1)+j; %Column of the Q-matrix for AMM i in period-2 state j
            maxvector = find(Q_n(:,q_col) == max(Q_n(:,q_col))); %Indices of all the values corresponding to a maximum.
            maxindex(j-1,i)  = maxvector(randi([1 length(maxvector)],1,1)); %Randomize if there are multiple greedy prices. Period-2 states are indexed from 2 to 5 (1 is period 1), hence the j-1.
            all_episodes(t,4+q_col) = price_grid(maxindex(j-1,i)); %Convert the price index
            %into the actual price and record the greedy price of AMM i at
            %episode t, period tau = 2, and state j. For i = 1 and j = 2 for
            %instance we use the price that maximizes column 2 of the
            %Q-matrix to fill column 6 of all_episodes; both correspond to
            %AMM 1, tau = 2, s_1 = NT.
        end %end of the loop over period-2 states
    end % end of the loop over AMMs

    for i=1:2
        %Compute the actual price chosen by AMM i based on experimentation or exploitation.
        if Experiment(t,2*(i-1)+2) == 1  %Column 2*(i-1)+2 corresponds to period tau = 2 for AMM i
            %AMM i explores: draw a random index in the price range
            s(1,i)  = randi([1 n_prices],1,1);
        else
            %AMM i exploits: use the greedy price index of the state
            %inherited from tau = 1. For instance if i = 1 and S(1,i) = 2
            %(no trade) we pick the first row and column of maxindex.
            s(1,i) = maxindex(S(1,i)-1,i);
        end
        all_episodes(t,2*(i-1)+2) = price_grid(s(1,i)); %Records the price actually quoted by AMM i in tau = 2
    end

    %We compute the revenue in tau = 2 and the final profit.
    %The customer buys if vc is at least the best price a_min, such that revenue is a_min if [vc >= a_min], and zero otherwise.
    %If the players set the same price the revenue is split between the players.

    all_episodes(t,20) = min(all_episodes(t,2),all_episodes(t,4)); %compute the lowest price.

    %We update the inventory I2 if a customer buys.
    if  all_episodes(t,20) <= vc(t,2) %Condition on the customer buying, hence there is a trade in tau = 2
        index = find(all_episodes(t,20) == [all_episodes(t,2) all_episodes(t,4)]); %find all AMMs that set a_min
        m = size(index,2); %compute the number of AMMs that set a_min
        for i=index %loop over AMMs who set the best price
            I2(1,i) = 1 / m;  %compute inventories, either 1/2 or 1 depending on how many AMMs set the best price
            %For other players, or if the customer doesn't buy, the inventory remains the pre-allocated value of zero.
        end
    end

    %Update the Q-value for each player in the right state. For instance if
    %there is no trade in tau = 1 then S(1,i) = 2 and we update column 2 of
    %the Q-matrix (column 7 for AMM 2), in the row corresponding to the
    %price chosen in tau = 2. The profit in period 2 is the revenue of
    %period 2 minus the accumulated inventory over both periods times the
    %value of the asset.
    for i=1:2
        Q_n(s(1,i),5*(i-1)+S(1,i)) = (1-alpha)*Q_n(s(1,i),5*(i-1)+S(1,i)) + alpha*(I2(1,i)*all_episodes(t,20)-(I1(1,i)+I2(1,i))*v_tilde(t));
    end
end

%Count the length of the final run of episodes with unchanged greedy prices
%in all states for both AMMs. The bound C_t < T stops the backward scan at
%episode 1: without it, a run covering every episode (or T = 1) would index
%row 0 of all_episodes and raise an error.

C_t=1;
while C_t < T && isequal(all_episodes(end,5:14),all_episodes(end-C_t,5:14))
    C_t = C_t+1;
end

%Compute the squared values of the variables in all_episodes:
all_episodes(:,21:40) = all_episodes(:,1:20).^2;

%Record the last episode
last_episode = [all_episodes(end,1:18), v_tilde(end,1), l(end,1), l(end,2), C_t];

